R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Cleaning

Semester Data

library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##load the dataset
semester_data_1 <- read_csv("raw/semester_dummy/semester_data_1.csv")
## Rows: 6632 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): x1, x2, x3, x4, x5, x6
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
semester_data_2 <- read_csv("raw/semester_dummy/semester_data_2.csv")
## Rows: 7258 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): x2
## dbl (5): x1, x3, x4, x5, x6
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(semester_data_1)
head(semester_data_2)
##rename the column names in "semester_data_1"
semester_data_1 <- semester_data_1 |>
  rename (unitid = x1, instnm = x2, semester = x3, quarter = x4, year = x5, Y = x6)
head(semester_data_1)
##drop the first row of "semester_data_1" BEFORE converting: every column was
##read as character, and converting first raises "NAs introduced by coercion"
##warnings (presumably row 1 is a second header line -- confirm against the raw CSV)
##then convert every column except "instnm" from character into double (numeric)

semester_data_1 <- semester_data_1 |>
  slice(-1) |>
  mutate(across(c(unitid, semester, quarter, year, Y), as.numeric))
## Warning: There were 5 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `unitid = as.numeric(unitid)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 4 remaining warnings.
head(semester_data_1)
##rename the column names in "semester_data_2"
semester_data_2 <- semester_data_2 |>
  rename (unitid = x1, instnm = x2, semester = x3, quarter = x4, year = x5, Y = x6)
head(semester_data_2)
##join the two data sets with bind_rows() because they have exactly the same columns
semester_data <- bind_rows(semester_data_1, semester_data_2)
head(semester_data)
##delete the column "Y"
semester_data <- semester_data |>
  select (-Y) #|>
  ##mutate(instnm = toupper(instnm))
head(semester_data)
##identify the transitioned year for each university
transition_year <- semester_data |>
  group_by(unitid) |>
  #lag() compares each row with the previous row of the same university, so
  #the rows must be in chronological order for the comparison to be meaningful
  arrange(year, .by_group = TRUE) |>
  mutate(prev_semester = lag(semester),
         prev_quarter = lag(quarter)) |>
  #keep the year in which a university moves from quarters to semesters
  filter(prev_semester == 0 & semester == 1 & prev_quarter == 1 & quarter == 0) |>
  #drop the grouping so it does not leak into later steps
  ungroup() |>
  rename(yearofsem = year) |>
  select(unitid, yearofsem) #should not add "instnm" here as upper cases and lower cases are mixed
head(transition_year)
##use left_join to merge the two data sets ("semester_data" and "transition_year")
semester_dummy <- semester_data |>
  left_join (transition_year, by = "unitid") #joined only by "unitid"
head(semester_dummy)
##finalizing the cleaning of the semester data
##create a dummy variable which shows 0 if semesters were yet to be introduced and 1 if already introduced
cleaned_semester_dummy <- semester_dummy |>
  #the comparison is row-wise, so no grouping by "unitid" is needed
  #0 while year < yearofsem, 1 from yearofsem onwards;
  #universities without a transition year (yearofsem is NA) get NA
  mutate(after = as.numeric(year >= yearofsem))
cleaned_semester_dummy
##checking the number of NAs in "cleaned_semester_dummy"
#sum(is.na(cleaned_semester_dummy)) #total number of NAs: 25688

##the semester data set provided by the organizer
#semester_by_organizer <- read_csv("intermediate_by_organizer/clean_semester_dummy.csv") |>
  #select(-1)
#semester_by_organizer
#checking the number of NAs in "semester_by_organizer"
#sum(is.na(semester_by_organizer)) #identical with "cleaned_semester_dummy"
##save this cleaned data set "cleaned_semester_dummy" as a .csv file
##remember to add ".csv" at the end
#write_csv(cleaned_semester_dummy, "cleaned_semester_aomi.csv")

Gradrate Data (Outcome)

##use the intermediate data for the tentative use

##years to load: 1991 to 2016, excluding 1994 (no file is read for that year)
gradrate_years <- c(1991:1993, 1995:2016)

##read one file per year, then bind everything in a single call instead of
##growing "df_gradrate" inside a loop (which re-copies the data on every pass)
df_gradrate <- gradrate_years |>
  lapply(\(yr) {
    #construct the filename
    filename <- paste0("raw/outcome_csv_aomi/", yr, ".csv")
    #show_col_types = FALSE silences the per-file column specification message
    read_csv(filename, show_col_types = FALSE)
  }) |>
  bind_rows()
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1249 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1250 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1248 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1247 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1244 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1241 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df_gradrate)
##calculate female's 4-year graduation rate
df_gradrate <- mutate(
  df_gradrate,
  #the column is read as character in some of the yearly files, so coerce it first
  women_gradrate_4yr = as.numeric(women_gradrate_4yr),
  #rescale by 0.01 (presumably percent -> proportion; confirm against the raw data)
  womengradrate4yr = 0.01 * women_gradrate_4yr
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `women_gradrate_4yr = as.numeric(women_gradrate_4yr)`.
## Caused by warning:
## ! NAs introduced by coercion
head(df_gradrate)
#calculate the total 4-year graduation rate, then convert into numeric values
#calculate the male 4-year graduation rate, then convert into numeric values
df_gradrate <- df_gradrate |>
  mutate(gradrate4yr = tot4yrgrads / totcohortsize,
         mengradrate4yr = m_4yrgrads / m_cohortsize)
head(df_gradrate)
##use round() to control the number of digit
#round both computed rates to three decimal places in one pass
df_gradrate <- df_gradrate |>
  mutate(
    across(
      c(gradrate4yr, mengradrate4yr),
      \(rate) round(rate, digits = 3)
    )
  )
head(df_gradrate)
##limit the range of years in the data set
#keep the rows whose year lies in the closed interval [1991, 2010]
outcome_data <- filter(df_gradrate, between(year, 1991, 2010))
head(outcome_data)
#write_csv(outcome_data, "cleaned_outcome_aomi.csv")

Covariates Data

##load the data
covariates <- read_csv("raw/covariates_csv_aomi/covariates.csv")
## Rows: 149408 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): university_id, category
## dbl (2): year, value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(covariates)
##rename the column name from "university_id" into "unitid"
##then, create a new column with the same name while removing "aaaa" from each row
covariates <- covariates |>
  rename(unitid = university_id) |>
  mutate(unitid = str_remove_all(unitid, "aaaa"))
head(covariates)
##see all the unique values in the "category" column
#unique(covariates$category)

##use "pivot_wider" to transfer the dataset
covariates <- covariates |>
  pivot_wider(names_from = category,
              values_from = value)
head(covariates)
##recall "cleaned_semester_dummy"
#cleaned_semester_dummy
##recall "outcome_data"
#outcome_data
##see all the unique values existing in the "year" column of "cleaned_semester_dummy" data
unique(cleaned_semester_dummy$year)
##  [1] 1991 1992 1993 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [16] 2007 2008 2009 2010
##see all the unique values existing in the "year" column of "outcome_data"
unique(outcome_data$year)
##  [1] 1991 1992 1993 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [16] 2007 2008 2009 2010
##limit the year duration for "covariates" data
covariates <- covariates |>
  #the year filter is row-wise, so no grouping by "unitid" is needed
  #(grouping here would also leave "covariates" grouped for every later step)
  filter(year >= 1991 & year != 1994 & year <= 2010)
head(covariates)
##see all the unique values existing in the "unitid" column in the "outcome_data" data
#unique(outcome_data$unitid)
##see all the unique values existing in the "unitid" column in the "covariates" data
#unique(covariates$unitid)

##filter the "unitid" column to keep only the rows that have the same value in "outcome_data" data set.

#pipe operators cannot be used within functions
#"unitid == outcome_data$unitid": This compares unitid with the vector outcome_data$unitid element by element (recycling the shorter vector), which is not what you want when outcome_data$unitid contains multiple values. The resulting logical vector of TRUE/FALSE does not tell you whether a unitid appears anywhere in outcome_data$unitid, so it does not help in filtering rows.
#"unitid %in% outcome_data$unitid": This checks if each value in unitid is present in the outcome_data$unitid vector. It creates a logical vector where each element is TRUE if the corresponding unitid value is found in outcome_data$unitid and FALSE otherwise. This is useful for filtering rows based on the presence of values in a list or vector.

covariates_data <- covariates |>
  #drop any grouping so "unitid" can be rewritten as an ordinary column
  ungroup() |>
  #convert characters into double ("unitid" column) first, so the membership
  #test below compares numbers with numbers instead of relying on implicit
  #number-to-text coercion
  mutate(unitid = as.numeric(unitid)) |>
  #keep only the universities that also appear in "outcome_data"
  filter(unitid %in% outcome_data$unitid)
head(covariates_data)
#write_csv(covariates_data, "cleaned_covariates_aomi.csv")

Master Data

##Semester Data: cleaned_semester_dummy
##Gradrate Data: outcome_data
##Covariates Data: covariates_data
cleaned_semester_dummy
outcome_data
covariates_data
##combine three data sets by "unitid (double)"
pre_master_data <- cleaned_semester_dummy |>
  left_join(outcome_data, by = c("unitid", "year"))
pre_master_data
#attach the covariates to each university-year row
master_data <- left_join(
  pre_master_data,
  covariates_data,
  by = c("unitid", "year")
)
#calculate the ratio of women cohorts and white cohorts, respectively
master_data <- mutate(
  master_data,
  per_white_cohort = white_cohortsize / totcohortsize,
  per_women_cohort = w_cohortsize / totcohortsize
)
master_data
### Compare my cleaned data and data by organizer (covariates data) : identical!
##mine
#sum(is.na(covariates_data)) #total number of NAs: 0

##by organizer
#covariates_by_organizer <- read_csv("intermediate_by_organizer/clean_covariates.csv")
#sum(is.na(covariates_by_organizer)) #total number of NAs: 0
#write_csv(master_data, "master_data_aomi.csv")

Analysis

##count the total number of NAs in "master_data"
master_data
#sum(is.na(master_data)) #total number of NAs: 25801

##load the cleaned master data provided by the organizers
#Master <- read_csv("intermediate_by_organizer/master.csv")
##count the number of NAs in each column in "master_data"
#vapply() walks the columns directly and returns one integer per column;
#apply() would first coerce the whole data frame to a matrix
master_data |>
  vapply(\(column) sum(is.na(column)), integer(1))
##             unitid             instnm           semester            quarter 
##                  0                  0                  0                  0 
##               year          yearofsem              after      totcohortsize 
##                  0              12844              12844                  0 
##       w_cohortsize       m_cohortsize        tot4yrgrads         m_4yrgrads 
##                  0                  0                  0                  0 
##         w_4yrgrads women_gradrate_4yr   womengradrate4yr        gradrate4yr 
##                  0                 24                 24                  0 
##     mengradrate4yr     instatetuition              costs            faculty 
##                 65                  0                  0                  0 
##   white_cohortsize   per_white_cohort   per_women_cohort 
##                  0                  0                  0

Replicate Table 1

##switchers: TRUE / never-switchers: FALSE
#abs(): absolute value

transition <- master_data |>
  group_by(unitid) |>
  #lag() looks at the previous row, so order each university's rows by year
  arrange(year, .by_group = TRUE) |>
  #a change is any row whose semester/quarter flag differs from the previous
  #row; the first row of each university is compared with itself (no change)
  mutate(semester_change = abs(semester - lag(semester, default = first(semester))),
         quarter_change = abs(quarter - lag(quarter, default = first(quarter))),
         switching = (semester_change + quarter_change) > 0) |>
  ungroup() |>
  filter(switching) |>
  #one row per switching university: a university that changes calendar more
  #than once would otherwise duplicate its rows in a later join on "unitid"
  distinct(unitid, switching)

##table: dataset containing the column called "switching"
table <- master_data |>
  left_join(transition, by = "unitid") |>
  mutate(switching = if_else(is.na(switching),"Never-Switchers", "Switchers"))
table
#library(table1)

Replicate Figure 1: 4-year graduation rate

library(ggplot2)

#one row per year holding that year's mean 4-year graduation rate
#(summarise() collapses each group; mutate() would repeat the mean on every
#university-year row, so the line would be drawn from many duplicated points)
avg_gradrate4yr <- master_data |>
  group_by(year) |>
  summarise(avg_gradrate4yr = mean(gradrate4yr))

avg_gradrate4yr |>
  ggplot(mapping = aes(x = year,
                       y = avg_gradrate4yr)) +
  geom_line() +
  xlim(1990, 2010) + #set the minimum and maximum limits on the x-axis
  ylim(0.25, 0.45) +
  labs(x = "Year",
       y = "4-year Graduation Rate")

Replicate Figure 1: fraction of schools on semesters

#one row per year holding the share of rows with semester == 1
#(summarise() collapses each group; mutate() would repeat the share on every
#university-year row, so the line would be drawn from many duplicated points)
semesters_share <- master_data |>
  group_by(year) |>
  summarise(semesters_share = sum(semester) / n())

semesters_share |>
  ggplot(mapping = aes(x = year, 
                       y = semesters_share)) +
  geom_line() +
  xlim(1990, 2010) + 
  ylim(0.8, 1.0) +
  labs (x = "Year",
        y = "Fractions of Schools on Semesters") 

###Scatter Plot
#colors()

##if creating the scatter plots manually:
#master_data |>
  #ggplot(mapping = aes(x = per_women_cohort, y = gradrate4yr)) +
  #geom_point(color = "skyblue", alpha = 0.5)

#master_data |>
  #ggplot(mapping = aes(x = per_white_cohort, y = gradrate4yr)) +
  #geom_point(color = "violet", alpha = 0.5)

#master_data |>
  #ggplot(mapping = aes(x = instatetuition, y = gradrate4yr)) +
  #geom_point(color = "springgreen", alpha = 0.5)

##if using rlang package
library(rlang)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
library(ggplot2)


#' Scatter plot of the 4-year graduation rate against one variable
#'
#' @param data Data frame containing a `gradrate4yr` column and the column
#'   passed as `xvar`.
#' @param xvar Unquoted name of the column to map to the x-axis.
#' @param color_value Colour applied to every point.
#' @param xaxis Label for the x-axis.
#' @return The ggplot object built from `data`.
plot_gradrate <- function(data, xvar, color_value, xaxis) {
  # {{ }} forwards the unquoted column name into aes()
  point_mapping <- aes(x = {{ xvar }}, y = gradrate4yr)
  axis_labels <- labs(x = xaxis, y = "4-year Graduation Rate")

  ggplot(data, mapping = point_mapping) +
    geom_point(color = color_value, alpha = 0.5) +
    axis_labels
}

plot_gradrate(master_data, per_women_cohort, "skyblue", "Female Student Shares")

plot_gradrate(master_data, per_white_cohort, "violet", "White Student Shares")

plot_gradrate(master_data, instatetuition, "springgreen", "Tuition")

### Regression Analysis

##Independent Variable: semester
##Dependent Variable: gradrate4yr
library(modelsummary)
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
##run linear regression
model <- lm(gradrate4yr ~ semester, data = master_data)
summary(model)
## 
## Call:
## lm(formula = gradrate4yr ~ semester, data = master_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37209 -0.18809 -0.03809  0.16391  0.58991 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.336755   0.007295  46.162  < 2e-16 ***
## semester    0.036336   0.007560   4.806 1.55e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2257 on 13887 degrees of freedom
## Multiple R-squared:  0.001661,   Adjusted R-squared:  0.001589 
## F-statistic:  23.1 on 1 and 13887 DF,  p-value: 1.555e-06
##controlling for per_women_cohort, per_white_cohort, instatetuition
#model_2 <- lm(gradrate4yr ~ semester + per_women_cohort + per_white_cohort + instatetuition, data = master_data)
#summary(model_2)
##create and display tables
modelsummary(model)
tinytable_2thfvzredujj3y5kxsgu
(1)
(Intercept) 0.337
(0.007)
semester 0.036
(0.008)
Num.Obs. 13889
R2 0.002
R2 Adj. 0.002
AIC -1932.3
BIC -1909.7
Log.Lik. 969.166
RMSE 0.23